In [1]:
# Importing the libraries

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql
In [2]:
# Load the dataset information

data = pd.read_excel(r"data(New).xlsx", header=0)

# Copy to back-up file

data_bk =data.copy()

# Display first 5 records

data.head()
Out[2]:
id year institute_type round_no quota pool institute_short program_name program_duration degree_short category opening_rank closing_rank is_preparatory
0 1 2016 IIT 6 AI Gender-Neutral IIT-Bombay Aerospace Engineering 4 Years B.Tech GEN 838 1841 0
1 2 2016 IIT 6 AI Gender-Neutral IIT-Bombay Aerospace Engineering 4 Years B.Tech OBC-NCL 408 1098 0
2 3 2016 IIT 6 AI Gender-Neutral IIT-Bombay Aerospace Engineering 4 Years B.Tech SC 297 468 0
3 4 2016 IIT 6 AI Gender-Neutral IIT-Bombay Aerospace Engineering 4 Years B.Tech ST 79 145 0
4 5 2016 IIT 6 AI Gender-Neutral IIT-Bombay Aerospace Engineering 4 Years B.Tech GEN-PWD 94 94 0
In [3]:
# Display the dataset information

data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64958 entries, 0 to 64957
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                64958 non-null  int64 
 1   year              64958 non-null  int64 
 2   institute_type    64958 non-null  object
 3   round_no          64958 non-null  int64 
 4   quota             64958 non-null  object
 5   pool              64958 non-null  object
 6   institute_short   64958 non-null  object
 7   program_name      64958 non-null  object
 8   program_duration  64958 non-null  object
 9   degree_short      64958 non-null  object
 10  category          64958 non-null  object
 11  opening_rank      64958 non-null  int64 
 12  closing_rank      64958 non-null  int64 
 13  is_preparatory    64958 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 6.9+ MB
In [4]:
# display the unique values of the all the variables
data.nunique()
Out[4]:
id                  25458
year                    6
institute_type          2
round_no                4
quota                   7
pool                    2
institute_short        54
program_name          130
program_duration        2
degree_short           13
category               10
opening_rank        10984
closing_rank        11940
is_preparatory          2
dtype: int64
In [6]:
# display the shape of the dataset
data.shape
Out[6]:
(64958, 14)
In [7]:
# display the duplicated values with in  dataset
data.duplicated().any()
Out[7]:
True
In [8]:
# display duplicate values with in dataset
data_dup=data[data.duplicated(keep='last')]
# disply the duplicate records
data_dup
Out[8]:
id year institute_type round_no quota pool institute_short program_name program_duration degree_short category opening_rank closing_rank is_preparatory
9205 9206 2021 IIT 1 AI Gender-Neutral IIT-Bombay Aerospace Engineering 4 Years B.Tech GEN 123 2003 0
9206 9207 2021 IIT 1 AI Female-Only IIT-Bombay Aerospace Engineering 4 Years B.Tech GEN 702 4419 0
9207 9208 2021 IIT 1 AI Gender-Neutral IIT-Bombay Aerospace Engineering 4 Years B.Tech OBC-NCL 389 1123 0
9208 9209 2021 IIT 1 AI Female-Only IIT-Bombay Aerospace Engineering 4 Years B.Tech OBC-NCL 1618 2505 0
9209 9210 2021 IIT 1 AI Gender-Neutral IIT-Bombay Aerospace Engineering 4 Years B.Tech SC 129 579 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
64903 31136 2021 NIT 1 JK Female-Only NIT-Srinagar Electronics and Communication Engineering 4 Years B.Tech SC 14185 24048 0
64904 31137 2021 NIT 1 JK Gender-Neutral NIT-Srinagar Electronics and Communication Engineering 4 Years B.Tech ST 2736 4171 0
64905 31138 2021 NIT 1 JK Female-Only NIT-Srinagar Electronics and Communication Engineering 4 Years B.Tech ST 10870 10870 0
64906 31139 2021 NIT 1 LA Gender-Neutral NIT-Srinagar Electronics and Communication Engineering 4 Years B.Tech GEN 166453 265454 0
64907 31140 2021 NIT 1 LA Female-Only NIT-Srinagar Electronics and Communication Engineering 4 Years B.Tech GEN 215054 215054 0

39500 rows × 14 columns

In [9]:
# remove the identified duplicate records
data=data.drop_duplicates()

# display the shape of the dataset
data.shape
Out[9]:
(25458, 14)
In [10]:
# Re-setting the raw index
data=data.reset_index(drop=True)
# copy file to backup file after deletion of duplicate records
data_bk2=data.copy()
In [11]:
# display the duplicated  values in the dataset
data.duplicated().any()
Out[11]:
False
In [12]:
# display the missing values information of variables
data.isnull().sum()
Out[12]:
id                  0
year                0
institute_type      0
round_no            0
quota               0
pool                0
institute_short     0
program_name        0
program_duration    0
degree_short        0
category            0
opening_rank        0
closing_rank        0
is_preparatory      0
dtype: int64
In [13]:
# display the descriptive status
data.describe()
Out[13]:
id year round_no opening_rank closing_rank is_preparatory
count 25458.000000 25458.000000 25458.000000 2.545800e+04 2.545800e+04 25458.000000
mean 15065.188978 2019.524118 4.864993 8.347711e+03 1.100359e+04 0.035706
std 9630.192936 1.431272 2.530553 2.946525e+04 4.170573e+04 0.185559
min 1.000000 2016.000000 1.000000 0.000000e+00 0.000000e+00 0.000000
25% 6365.250000 2019.000000 1.000000 6.550000e+02 8.260000e+02 0.000000
50% 12729.500000 2020.000000 6.000000 2.237000e+03 2.715000e+03 0.000000
75% 23803.750000 2021.000000 7.000000 6.781750e+03 8.155500e+03 0.000000
max 31140.000000 2021.000000 7.000000 1.082601e+06 1.144790e+06 1.000000
In [14]:
# To find outliers
first_quantile=data['id'].quantile(.25)
third_quantile=data['id'].quantile(.75)
IQR=third_quantile-first_quantile
upper_bound=round(third_quantile+1.5*IQR,3)
upper_bound
lower_bound=round(first_quantile-1.5*IQR,3)
lower_bound
data[(data.id < lower_bound) | (data.id > upper_bound)]
Out[14]:
id year institute_type round_no quota pool institute_short program_name program_duration degree_short category opening_rank closing_rank is_preparatory
In [15]:
# To find out the  outliers
first_quantile=data['year'].quantile(.25)
third_quantile=data['year'].quantile(.75)
IQR=third_quantile-first_quantile
upper_bound=round(third_quantile+1.5*IQR,3)
upper_bound
lower_bound=round(first_quantile-1.5*IQR,3)
lower_bound
data[(data.year < lower_bound) | (data.year > upper_bound)]
Out[15]:
id year institute_type round_no quota pool institute_short program_name program_duration degree_short category opening_rank closing_rank is_preparatory
In [16]:
# To find outliers
first_quantile=data['round_no'].quantile(.25)
third_quantile=data['round_no'].quantile(.75)
IQR=third_quantile-first_quantile
upper_bound=round(third_quantile+1.5*IQR,3)
upper_bound
lower_bound=round(first_quantile-1.5*IQR,3)
lower_bound
data[(data.round_no < lower_bound) | (data.round_no > upper_bound)]
Out[16]:
id year institute_type round_no quota pool institute_short program_name program_duration degree_short category opening_rank closing_rank is_preparatory
In [18]:
# display the institute_type variables count
data['institute_type'].value_counts() 
Out[18]:
IIT    13155
NIT    12303
Name: institute_type, dtype: int64
In [19]:
# replace the 'institute_type' varaible and covert to integer value
data['institute_type']=data['institute_type'].str.replace('IIT','1')
data['institute_type']=data['institute_type'].str.replace('NIT','0')
data['institute_type']=data['institute_type'].astype(int)
In [20]:
# display the institute_type variables count
data['institute_type'].value_counts() 
Out[20]:
1    13155
0    12303
Name: institute_type, dtype: int64
In [21]:
# display the pool variables count
data['pool'].value_counts()
Out[21]:
Gender-Neutral    16005
Female-Only        9453
Name: pool, dtype: int64
In [22]:
# replace the 'pool' varaible and covert to integer value
data['pool']=data['pool'].str.replace('Gender-Neutral','1')
data['pool']=data['pool'].str.replace('Female-Only','0')
data['pool']=data['pool'].astype(int)
In [23]:
# display the pool variables count
data['pool'].value_counts()
Out[23]:
1    16005
0     9453
Name: pool, dtype: int64
In [24]:
# display the program_duration variables count
data['program_duration'].value_counts()
Out[24]:
4 Years    21104
5 Years     4354
Name: program_duration, dtype: int64
In [25]:
# replace the 'program_duration' varaible and covert to integer value
data['program_duration']=data['program_duration'].str.replace('4 Years','1')
data['program_duration']=data['program_duration'].str.replace('5 Years','0')
data['program_duration']=data['program_duration'].astype(int)
In [26]:
# display the program_duration variables count
data['program_duration'].value_counts()
Out[26]:
1    21104
0     4354
Name: program_duration, dtype: int64
In [27]:
# display the quota  variables count
data['quota'].value_counts()
Out[27]:
AI    13155
OS     6502
HS     5486
JK      128
GO       95
AP       72
LA       20
Name: quota, dtype: int64
In [29]:
#  use the labelEncoder to handle categorical data
from sklearn.preprocessing import LabelEncoder
LE= LabelEncoder()
data['quota']=LE.fit_transform(data[['quota']])
In [30]:
# display the institute_short  variables count
data['institute_short'].value_counts()
Out[30]:
IIT-Kharagpur              2120
IIT-(BHU) Varanasi         1088
NIT-Rourkela               1054
IIT-Bombay                 1034
IIT-Delhi                  1018
IIT-Roorkee                 989
IIT-Madras                  949
IIT-Kanpur                  844
NIT-Raipur                  748
IIT-(ISM) Dhanbad           739
NIT-Calicut                 695
NIT-Hamirpur                686
NIT-Jalandhar               649
NIT-Karnataka-Surathkal     632
NIT-Bhopal                  624
NIT-Durgapur                605
NIT-Allahabad               605
IIT-Bhubaneswar             582
NIT-Agartala                561
NIT-Jaipur                  549
IIT-Guwahati                531
IIT-Hyderabad               484
NIT-Kurukshetra             477
NIT-Patna                   468
NIT-Jamshedpur              460
NIT-Srinagar                439
NIT-Silchar                 409
IIT-Ropar                   311
NIT-Warangal                306
NIT-Tiruchirappalli         303
IIT-Patna                   290
IIT-Mandi                   275
IIT-Gandhinagar             272
IIT-Jodhpur                 265
NIT-Goa                     264
IIT-Indore                  244
IIT-Jammu                   240
NIT-Puducherry              238
IIT-Tirupati                230
IIT-Palakkad                190
NIT-Arunachal-Pradesh       188
NIT-Manipur                 188
NIT-Meghalaya               175
NIT-Delhi                   162
IIT-Goa                     161
NIT-Nagaland                160
IIT-Bhilai                  155
NIT-Mizoram                 153
NIT-Sikkim                  147
IIT-Dharwad                 144
NIT-Uttarakhand              99
NIT-Surat                    93
NIT-Nagpur                   92
NIT-Andhra-Pradesh           74
Name: institute_short, dtype: int64
In [31]:
#  use the labelEncoder to handle categorical data
from sklearn.preprocessing import LabelEncoder
LE= LabelEncoder()
data['institute_short']=LE.fit_transform(data[['institute_short']])
In [32]:
# display the program_name variables count
data['program_name'].value_counts()
Out[32]:
Computer Science and Engineering                                                                          3330
Mechanical Engineering                                                                                    2774
Civil Engineering                                                                                         2566
Electrical Engineering                                                                                    2279
Electronics and Communication Engineering                                                                 1869
                                                                                                          ... 
Manufacturing Science and Engineering with M.Tech. in Industrial andSystems Engineering and Management       7
Industrial and Systems Engineering with M.Tech. in Industrial and SystemsEngineering and Management          7
Agricultural and Food Engineering with M.Tech. in any of the listedspecializations                           7
Engineering Physics and M.Tech. with specialization in Nano Science                                          5
Civil Engineering with M.Tech. in Structural Engineering                                                     4
Name: program_name, Length: 130, dtype: int64
In [33]:
#  use the labelEncoder to handle categorical data
from sklearn.preprocessing import LabelEncoder
LE= LabelEncoder()
data['program_name']=LE.fit_transform(data[['program_name']])
In [34]:
# display the degree_short variables count
data['degree_short'].value_counts()
Out[34]:
B.Tech                   20456
B.Tech + M.Tech (IDD)     2560
BSc                        590
B.Arch                     538
Int MSc.                   298
Btech + M.Tech (IDD)       293
Int M.Tech                 249
Int Msc.                   233
BS + MS (IDD)              110
BSc + MSc (IDD)             69
B.Plan                      54
B.Pharm                      4
B.Pharm + M.Pharm            4
Name: degree_short, dtype: int64
In [35]:
#  use the labelEncoder to handle categorical data
from sklearn.preprocessing import LabelEncoder
LE= LabelEncoder()
data['degree_short']=LE.fit_transform(data[['degree_short']])
In [36]:
# display the category variables count
data['category'].value_counts()
Out[36]:
GEN            5252
OBC-NCL        4986
SC             4908
ST             4327
GEN-EWS        3205
GEN-PWD        1565
OBC-NCL-PWD     770
GEN-EWS-PWD     185
SC-PWD          182
ST-PWD           78
Name: category, dtype: int64
In [37]:
#  use the labelEncoder to handle categorical data
from sklearn.preprocessing import LabelEncoder
LE= LabelEncoder()
data['category']=LE.fit_transform(data[['category']])
In [38]:
# Display the dataset information

data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25458 entries, 0 to 25457
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   id                25458 non-null  int64
 1   year              25458 non-null  int64
 2   institute_type    25458 non-null  int32
 3   round_no          25458 non-null  int64
 4   quota             25458 non-null  int64
 5   pool              25458 non-null  int32
 6   institute_short   25458 non-null  int32
 7   program_name      25458 non-null  int32
 8   program_duration  25458 non-null  int32
 9   degree_short      25458 non-null  int32
 10  category          25458 non-null  int32
 11  opening_rank      25458 non-null  int64
 12  closing_rank      25458 non-null  int64
 13  is_preparatory    25458 non-null  int64
dtypes: int32(7), int64(7)
memory usage: 2.0 MB
In [39]:
# display the sample sataset
data.sample(5)
Out[39]:
id year institute_type round_no quota pool institute_short program_name program_duration degree_short category opening_rank closing_rank is_preparatory
11145 11146 2021 1 1 0 0 21 47 1 4 8 406 406 1
19082 23791 2020 0 6 6 1 30 110 1 4 0 36295 41685 0
6465 6466 2019 1 7 0 1 12 28 1 4 6 2274 2365 0
24895 30577 2021 0 1 3 0 46 26 1 4 6 7792 7792 0
24127 29810 2021 0 1 6 0 45 47 1 4 0 23670 26634 0
In [40]:
# Count the target or dependent variable by '0' & '1' and their proportion 
# (>= 10 : 1, then the dataset is imbalance data)

is_preparatory_count = data.is_preparatory.value_counts()
print('Class 0:', is_preparatory_count[0])
print('Class 1:', is_preparatory_count[1])
print('Proportion:', round(is_preparatory_count[0] / is_preparatory_count[1], 2), ': 1')
print('Total IIT-NIT Data records:', len(data))
Class 0: 24549
Class 1: 909
Proportion: 27.01 : 1
Total IIT-NIT Data records: 25458
In [41]:
# Identify the independent and target (dependent) variables

Indepvar=[]
for col in data.columns:
    if col != 'is_preparatory':
        Indepvar.append(col)
        
TargetVar='is_preparatory'

x=data[Indepvar]

y=data[TargetVar]
In [42]:
# Random oversampling can be implemented using the RandomOverSampler class

from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy=0.125)

x_over, y_over = oversample.fit_resample(x, y)

print(x_over.shape)
print(y_over.shape)
(27617, 13)
(27617,)
In [43]:
# Random oversampling can be implemented using the RandomOverSampler class

from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy=0.125)

x_over, y_over = oversample.fit_resample(x, y)

print(x_over.shape)
print(y_over.shape)
(27617, 13)
(27617,)
In [44]:
# split the data into train and test (random sampling)

#70% data train and 30% data test

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

# display yhe shape for train and test data

x_train.shape,x_test.shape,y_train.shape,y_test.shape
Out[44]:
((17820, 13), (7638, 13), (17820,), (7638,))
In [45]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train= mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

Logestic Regression Algorithm(Classifier)¶

In [46]:
# To build the 'Logistic Regression' model with random sampling 
from sklearn.linear_model import LogisticRegression
# create an object for model
ModelLR= LogisticRegression()
# train the model
ModelLR.fit(x_train,y_train)
# predict themodel with test the dataset
y_pred=ModelLR.predict(x_test)
y_pred_prob=ModelLR.predict_proba(x_test)
In [47]:
#To display the algorithm paramaters
params=ModelLR.get_params()
print(params)
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
In [48]:
# Confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Actual values

actual = y_test

# Predicted values

predicted = y_pred

# Confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0], sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# Outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# Classification report for precision, recall f1-score and accuracy

C_Report = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n', C_Report)

# Calculating the metrics

sensitivity = round(tp/(tp+fn), 3);
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%')
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)
# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))

# ROC Curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
model_roc_auc = roc_auc_score(actual, predicted)
fpr, tpr, thresholds = roc_curve(actual, ModelLR.predict_proba(x_test)[:,1])
plt.figure()
#----------------------------------------------------
plt.plot(fpr, tpr, label= 'Classification Model' % model_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show() 
print('-----------------------------------------------------------------------------------------------------')
Confusion matrix : 
 [[   0  262]
 [   0 7376]]
Outcome values : 
 0 262 0 7376
Classification report : 
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.48      0.50      0.49      7638
weighted avg       0.93      0.97      0.95      7638

Accuracy : 96.6 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Specificity or True Negative Rate : 100.0 %
Balanced Accuracy : 50.0 %
MCC : nan
roc_auc_score: 0.5
-----------------------------------------------------------------------------------------------------

Decision Tree Algorithm(Classifier)¶

In [49]:
# To build the 'Decision tree algorithm' model with random sampling 
from sklearn.tree import DecisionTreeClassifier
# create an object for model
ModelDT=DecisionTreeClassifier()
# train the model
ModelDT.fit(x_train,y_train)
# predict themodel with test the dataset
y_pred=ModelDT.predict(x_test)
y_pred_prob=ModelDT.predict_proba(x_test)
In [50]:
#To display the algorithm paramaters
params=ModelDT.get_params()
print(params)
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
In [51]:
# Confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Actual values

actual = y_test

# Predicted values

predicted = y_pred

# Confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0], sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# Outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# Classification report for precision, recall f1-score and accuracy

C_Report = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n', C_Report)

# Calculating the metrics

sensitivity = round(tp/(tp+fn), 3);
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%')
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)
# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))

# ROC Curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
model_roc_auc = roc_auc_score(actual, predicted)
fpr, tpr, thresholds = roc_curve(actual, ModelDT.predict_proba(x_test)[:,1])
plt.figure()
#----------------------------------------------------
plt.plot(fpr, tpr, label= 'Classification Model' % model_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show() 
print('-----------------------------------------------------------------------------------------------------')
Confusion matrix : 
 [[ 170   92]
 [ 218 7158]]
Outcome values : 
 170 92 218 7158
Classification report : 
               precision    recall  f1-score   support

           1       0.44      0.65      0.52       262
           0       0.99      0.97      0.98      7376

    accuracy                           0.96      7638
   macro avg       0.71      0.81      0.75      7638
weighted avg       0.97      0.96      0.96      7638

Accuracy : 95.9 %
Precision : 43.8 %
Recall : 64.9 %
F1 Score : 0.523
Specificity or True Negative Rate : 97.0 %
Balanced Accuracy : 81.0 %
MCC : 0.513
roc_auc_score: 0.81
-----------------------------------------------------------------------------------------------------
In [52]:
# plot the decision tree

import matplotlib.pyplot as plt
from sklearn import tree

plt.figure(figsize=(20,5))
tree.plot_tree(ModelDT);

Random Forest Algorithm(Classifier)¶

In [53]:
# To build the 'Random forest algorithm' model with random sampling 
from sklearn.ensemble import RandomForestClassifier
# create an object for model
ModelRF= RandomForestClassifier()
# train the model
ModelRF.fit(x_train,y_train)
# predict themodel with test the dataset
y_pred=ModelRF.predict(x_test)
y_pred_prob=ModelRF.predict_proba(x_test)
In [54]:
#To display the algorithm paramaters
params=ModelRF.get_params()
print(params)
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
In [55]:
# Confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Actual values

actual = y_test

# Predicted values

predicted = y_pred

# Confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0], sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# Outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# Classification report for precision, recall f1-score and accuracy

C_Report = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n', C_Report)

# Calculating the metrics

sensitivity = round(tp/(tp+fn), 3);
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%')
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)
# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))

# ROC Curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
model_roc_auc = roc_auc_score(actual, predicted)
fpr, tpr, thresholds = roc_curve(actual, ModelRF.predict_proba(x_test)[:,1])
plt.figure()
#----------------------------------------------------
plt.plot(fpr, tpr, label= 'Classification Model' % model_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show() 
print('-----------------------------------------------------------------------------------------------------')
Confusion matrix : 
 [[ 143  119]
 [  37 7339]]
Outcome values : 
 143 119 37 7339
Classification report : 
               precision    recall  f1-score   support

           1       0.79      0.55      0.65       262
           0       0.98      0.99      0.99      7376

    accuracy                           0.98      7638
   macro avg       0.89      0.77      0.82      7638
weighted avg       0.98      0.98      0.98      7638

Accuracy : 98.0 %
Precision : 79.4 %
Recall : 54.6 %
F1 Score : 0.647
Specificity or True Negative Rate : 99.5 %
Balanced Accuracy : 77.0 %
MCC : 0.649
roc_auc_score: 0.77
-----------------------------------------------------------------------------------------------------

Extra Tree Algorithm(Classifier)¶

In [56]:
# To build the 'Random Forest' model with random sampling

from sklearn.ensemble import ExtraTreesClassifier

# Create an object for Extra Trees Classifier 

ModelET = ExtraTreesClassifier()

# Train the model with train data 

ModelET.fit(x_train,y_train)

# Predict the model with test data set

y_pred = ModelET.predict(x_test)
y_pred_prob = ModelET.predict_proba(x_test)

# Confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

C_Report = classification_report(actual, predicted, labels=[1,0])

print('Classification report : \n', C_Report)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3);
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%'  )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))

# ROC Curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
model_roc_auc = roc_auc_score(actual, predicted)
fpr, tpr, thresholds = roc_curve(actual, ModelET.predict_proba(x_test)[:,1])
plt.figure()
#--------------------------------------------------------------------
plt.plot(fpr, tpr, label= 'Classification Model' % model_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show() 
print('-----------------------------------------------------------------------------------------------------')
Confusion matrix : 
 [[ 117  145]
 [  50 7326]]
Outcome values : 
 117 145 50 7326
Classification report : 
               precision    recall  f1-score   support

           1       0.70      0.45      0.55       262
           0       0.98      0.99      0.99      7376

    accuracy                           0.97      7638
   macro avg       0.84      0.72      0.77      7638
weighted avg       0.97      0.97      0.97      7638

Accuracy : 97.4 %
Precision : 70.1 %
Recall : 44.7 %
F1 Score : 0.545
Specificity or True Negative Rate : 99.3 %
Balanced Accuracy : 72.0 %
MCC : 0.547
roc_auc_score: 0.72
-----------------------------------------------------------------------------------------------------
In [57]:
# display the all the variables
data.columns
Out[57]:
Index(['id', 'year', 'institute_type', 'round_no', 'quota', 'pool',
       'institute_short', 'program_name', 'program_duration', 'degree_short',
       'category', 'opening_rank', 'closing_rank', 'is_preparatory'],
      dtype='object')
In [58]:
# Create a list for plotting the decision trees

figcols = ['id', 'year', 'institute_type', 'round_no', 'quota', 'pool',
       'institute_short', 'program_name', 'program_duration', 'degree_short',
       'category', 'opening_rank', 'closing_rank', 'is_preparatory']
In [59]:
# Visualize individual trees and code below visualizes the first decision tree of Extra Trees Classifier

from sklearn import tree

fn1=figcols
cn1=['0', '1']

fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize = (4,4), dpi=800)
tree.plot_tree(ModelET.estimators_[0],
               feature_names = fn1, 
               class_names=cn1,
               filled = True);
#fig.savefig('ModelET.png')
In [60]:
# Visualize individual trees and code below visualizes the first 5 decision trees of Extra Trees Classifier

from sklearn import tree

fn2=figcols
cn2=['0', '1']

fig, axes = plt.subplots(nrows = 1, ncols = 5, figsize = (10,2), dpi=3000)
for index in range(0, 5):
    tree.plot_tree(ModelET.estimators_[index],
                   feature_names = fn2, 
                   class_names=cn2,
                   filled = True,
                   ax = axes[index]);
    
    axes[index].set_title('Estimator: ' + str(index), fontsize = 11)
#fig.savefig('ModelET1.png')

KNN Algorithm¶

In [61]:
# load the KNNResults
KNNResults = pd.read_excel(r"KNN_ResultsNew.xlsx", header=0)
KNNResults.head()
Out[61]:
Model Name KNN K Value True_Positive False_Negative False_Positive True_Negative Accuracy Precision Recall F1 Score Specificity MCC ROC_AUC_Score Balanced Accuracy
In [62]:
# Build KNN Model

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import sklearn.metrics as metrics

from sklearn.metrics import roc_curve, roc_auc_score

accuracy = []

for a in range(1, 21, 1):
    
    k = a
    
    # Build the model
    
    ModelKNN = KNeighborsClassifier(n_neighbors=k)
    
    # Train the model
    
    ModelKNN.fit(x_train, y_train)
    
    # Predict the model
    
    y_pred = ModelKNN.predict(x_test)
    y_pred_prob = ModelKNN.predict_proba(x_test)
    
    print('KNN_K_value = ', a)
    
    # Print the model name
    
    print('Model Name: ', ModelKNN)
    
    # confusion matrix in sklearn
    
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    
    # actual values
    
    actual = y_test
    
    # predicted values
    
    predicted = y_pred
    
    # confusion matrix
    
    matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
    print('Confusion matrix : \n', matrix)
    
    # outcome values order in sklearn
    
    tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
    print('Outcome values : \n', tp, fn, fp, tn)
    
    # classification report for precision, recall f1-score and accuracy
    
    C_Report = classification_report(actual,predicted,labels=[1,0])
    
    print('Classification report : \n', C_Report)
    
    # calculating the metrics
    
    sensitivity = round(tp/(tp+fn), 3);
    specificity = round(tn/(tn+fp), 3);
    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
    balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
    precision = round(tp/(tp+fp), 3);
    f1Score = round((2*tp/(2*tp + fp + fn)), 3);
    
    # Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
    # A model with a score of +1 is a perfect model and -1 is a poor model
    
    from math import sqrt
    
    mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)
    
    print('Accuracy :', round(accuracy*100, 2),'%')
    print('Precision :', round(precision*100, 2),'%')
    print('Recall :', round(sensitivity*100,2), '%')
    print('F1 Score :', f1Score)
    print('Specificity or True Negative Rate :', round(specificity*100,2), '%'  )
    print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
    print('MCC :', MCC)
    
    # Area under ROC curve 
    
    from sklearn.metrics import roc_curve, roc_auc_score
    
    print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))
    
    # ROC Curve
    
    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import roc_curve
    model_roc_auc = roc_auc_score(actual, predicted)
    fpr, tpr, thresholds = roc_curve(actual, ModelKNN.predict_proba(x_test)[:,1])
    plt.figure()
    # plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot(fpr, tpr, label= 'Classification Model' % model_roc_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    #plt.savefig('Log_ROC')
    plt.show()
    #------------------------------------------------------------------------------
    new_row = {'Model Name' : ModelKNN,
               'KNN K Value' : a,
               'True_Positive' : tp,
               'False_Negative' : fn,
               'False_Positive' : fp,
               'True_Negative' : tn,
               'Accuracy' : accuracy,
               'Precision' : precision,
               'Recall' : sensitivity,
               'F1 Score' : f1Score,
               'Specificity' : specificity,
               'MCC':MCC,
               'ROC_AUC_Score':roc_auc_score(actual, predicted),
               'Balanced Accuracy':balanced_accuracy}
    KNNResults = KNNResults.append(new_row, ignore_index=True)
    #------KNN_Results------------------------------------------------------------------------
KNN_K_value =  1
Model Name:  KNeighborsClassifier(n_neighbors=1)
Confusion matrix : 
 [[ 140  122]
 [ 102 7274]]
Outcome values : 
 140 122 102 7274
Classification report : 
               precision    recall  f1-score   support

           1       0.58      0.53      0.56       262
           0       0.98      0.99      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.78      0.76      0.77      7638
weighted avg       0.97      0.97      0.97      7638

Accuracy : 97.1 %
Precision : 57.9 %
Recall : 53.4 %
F1 Score : 0.556
Specificity or True Negative Rate : 98.6 %
Balanced Accuracy : 76.0 %
MCC : 0.541
roc_auc_score: 0.76
KNN_K_value =  2
Model Name:  KNeighborsClassifier(n_neighbors=2)
Confusion matrix : 
 [[  63  199]
 [  43 7333]]
Outcome values : 
 63 199 43 7333
Classification report : 
               precision    recall  f1-score   support

           1       0.59      0.24      0.34       262
           0       0.97      0.99      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.78      0.62      0.66      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.8 %
Precision : 59.4 %
Recall : 24.0 %
F1 Score : 0.342
Specificity or True Negative Rate : 99.4 %
Balanced Accuracy : 61.7 %
MCC : 0.365
roc_auc_score: 0.617
KNN_K_value =  3
Model Name:  KNeighborsClassifier(n_neighbors=3)
Confusion matrix : 
 [[ 106  156]
 [  77 7299]]
Outcome values : 
 106 156 77 7299
Classification report : 
               precision    recall  f1-score   support

           1       0.58      0.40      0.48       262
           0       0.98      0.99      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.78      0.70      0.73      7638
weighted avg       0.97      0.97      0.97      7638

Accuracy : 96.9 %
Precision : 57.9 %
Recall : 40.5 %
F1 Score : 0.476
Specificity or True Negative Rate : 99.0 %
Balanced Accuracy : 69.8 %
MCC : 0.469
roc_auc_score: 0.697
KNN_K_value =  4
Model Name:  KNeighborsClassifier(n_neighbors=4)
Confusion matrix : 
 [[  62  200]
 [  35 7341]]
Outcome values : 
 62 200 35 7341
Classification report : 
               precision    recall  f1-score   support

           1       0.64      0.24      0.35       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.81      0.62      0.66      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.9 %
Precision : 63.9 %
Recall : 23.7 %
F1 Score : 0.345
Specificity or True Negative Rate : 99.5 %
Balanced Accuracy : 61.6 %
MCC : 0.377
roc_auc_score: 0.616
KNN_K_value =  5
Model Name:  KNeighborsClassifier()
Confusion matrix : 
 [[  83  179]
 [  62 7314]]
Outcome values : 
 83 179 62 7314
Classification report : 
               precision    recall  f1-score   support

           1       0.57      0.32      0.41       262
           0       0.98      0.99      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.77      0.65      0.70      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.8 %
Precision : 57.2 %
Recall : 31.7 %
F1 Score : 0.408
Specificity or True Negative Rate : 99.2 %
Balanced Accuracy : 65.4 %
MCC : 0.411
roc_auc_score: 0.654
KNN_K_value =  6
Model Name:  KNeighborsClassifier(n_neighbors=6)
Confusion matrix : 
 [[  58  204]
 [  25 7351]]
Outcome values : 
 58 204 25 7351
Classification report : 
               precision    recall  f1-score   support

           1       0.70      0.22      0.34       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.84      0.61      0.66      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 97.0 %
Precision : 69.9 %
Recall : 22.1 %
F1 Score : 0.336
Specificity or True Negative Rate : 99.7 %
Balanced Accuracy : 60.9 %
MCC : 0.383
roc_auc_score: 0.609
KNN_K_value =  7
Model Name:  KNeighborsClassifier(n_neighbors=7)
Confusion matrix : 
 [[  77  185]
 [  41 7335]]
Outcome values : 
 77 185 41 7335
Classification report : 
               precision    recall  f1-score   support

           1       0.65      0.29      0.41       262
           0       0.98      0.99      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.81      0.64      0.70      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 97.0 %
Precision : 65.3 %
Recall : 29.4 %
F1 Score : 0.405
Specificity or True Negative Rate : 99.4 %
Balanced Accuracy : 64.4 %
MCC : 0.426
roc_auc_score: 0.644
KNN_K_value =  8
Model Name:  KNeighborsClassifier(n_neighbors=8)
Confusion matrix : 
 [[  50  212]
 [  22 7354]]
Outcome values : 
 50 212 22 7354
Classification report : 
               precision    recall  f1-score   support

           1       0.69      0.19      0.30       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.83      0.59      0.64      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.9 %
Precision : 69.4 %
Recall : 19.1 %
F1 Score : 0.299
Specificity or True Negative Rate : 99.7 %
Balanced Accuracy : 59.4 %
MCC : 0.354
roc_auc_score: 0.594
KNN_K_value =  9
Model Name:  KNeighborsClassifier(n_neighbors=9)
Confusion matrix : 
 [[  58  204]
 [  31 7345]]
Outcome values : 
 58 204 31 7345
Classification report : 
               precision    recall  f1-score   support

           1       0.65      0.22      0.33       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.81      0.61      0.66      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.9 %
Precision : 65.2 %
Recall : 22.1 %
F1 Score : 0.33
Specificity or True Negative Rate : 99.6 %
Balanced Accuracy : 60.8 %
MCC : 0.368
roc_auc_score: 0.609
KNN_K_value =  10
Model Name:  KNeighborsClassifier(n_neighbors=10)
Confusion matrix : 
 [[  48  214]
 [  16 7360]]
Outcome values : 
 48 214 16 7360
Classification report : 
               precision    recall  f1-score   support

           1       0.75      0.18      0.29       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.86      0.59      0.64      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 97.0 %
Precision : 75.0 %
Recall : 18.3 %
F1 Score : 0.294
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 59.0 %
MCC : 0.361
roc_auc_score: 0.591
KNN_K_value =  11
Model Name:  KNeighborsClassifier(n_neighbors=11)
Confusion matrix : 
 [[  55  207]
 [  24 7352]]
Outcome values : 
 55 207 24 7352
Classification report : 
               precision    recall  f1-score   support

           1       0.70      0.21      0.32       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.83      0.60      0.65      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 97.0 %
Precision : 69.6 %
Recall : 21.0 %
F1 Score : 0.323
Specificity or True Negative Rate : 99.7 %
Balanced Accuracy : 60.4 %
MCC : 0.372
roc_auc_score: 0.603
KNN_K_value =  12
Model Name:  KNeighborsClassifier(n_neighbors=12)
Confusion matrix : 
 [[  45  217]
 [  18 7358]]
Outcome values : 
 45 217 18 7358
Classification report : 
               precision    recall  f1-score   support

           1       0.71      0.17      0.28       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.84      0.58      0.63      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.9 %
Precision : 71.4 %
Recall : 17.2 %
F1 Score : 0.277
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 58.5 %
MCC : 0.341
roc_auc_score: 0.585
KNN_K_value =  13
Model Name:  KNeighborsClassifier(n_neighbors=13)
Confusion matrix : 
 [[  50  212]
 [  21 7355]]
Outcome values : 
 50 212 21 7355
Classification report : 
               precision    recall  f1-score   support

           1       0.70      0.19      0.30       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.84      0.59      0.64      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.9 %
Precision : 70.4 %
Recall : 19.1 %
F1 Score : 0.3
Specificity or True Negative Rate : 99.7 %
Balanced Accuracy : 59.4 %
MCC : 0.357
roc_auc_score: 0.594
KNN_K_value =  14
Model Name:  KNeighborsClassifier(n_neighbors=14)
Confusion matrix : 
 [[  41  221]
 [  13 7363]]
Outcome values : 
 41 221 13 7363
Classification report : 
               precision    recall  f1-score   support

           1       0.76      0.16      0.26       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.87      0.58      0.62      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.9 %
Precision : 75.9 %
Recall : 15.6 %
F1 Score : 0.259
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 57.7 %
MCC : 0.336
roc_auc_score: 0.577
KNN_K_value =  15
Model Name:  KNeighborsClassifier(n_neighbors=15)
Confusion matrix : 
 [[  48  214]
 [  18 7358]]
Outcome values : 
 48 214 18 7358
Classification report : 
               precision    recall  f1-score   support

           1       0.73      0.18      0.29       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.85      0.59      0.64      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 97.0 %
Precision : 72.7 %
Recall : 18.3 %
F1 Score : 0.293
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 59.0 %
MCC : 0.355
roc_auc_score: 0.59
KNN_K_value =  16
Model Name:  KNeighborsClassifier(n_neighbors=16)
Confusion matrix : 
 [[  40  222]
 [  12 7364]]
Outcome values : 
 40 222 12 7364
Classification report : 
               precision    recall  f1-score   support

           1       0.77      0.15      0.25       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.87      0.58      0.62      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.9 %
Precision : 76.9 %
Recall : 15.3 %
F1 Score : 0.255
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 57.6 %
MCC : 0.334
roc_auc_score: 0.576
KNN_K_value =  17
Model Name:  KNeighborsClassifier(n_neighbors=17)
Confusion matrix : 
 [[  45  217]
 [  13 7363]]
Outcome values : 
 45 217 13 7363
Classification report : 
               precision    recall  f1-score   support

           1       0.78      0.17      0.28       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.87      0.58      0.63      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 97.0 %
Precision : 77.6 %
Recall : 17.2 %
F1 Score : 0.281
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 58.5 %
MCC : 0.356
roc_auc_score: 0.585
KNN_K_value =  18
Model Name:  KNeighborsClassifier(n_neighbors=18)
Confusion matrix : 
 [[  38  224]
 [  12 7364]]
Outcome values : 
 38 224 12 7364
Classification report : 
               precision    recall  f1-score   support

           1       0.76      0.15      0.24       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.87      0.57      0.61      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.9 %
Precision : 76.0 %
Recall : 14.5 %
F1 Score : 0.244
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 57.2 %
MCC : 0.324
roc_auc_score: 0.572
KNN_K_value =  19
Model Name:  KNeighborsClassifier(n_neighbors=19)
Confusion matrix : 
 [[  44  218]
 [  14 7362]]
Outcome values : 
 44 218 14 7362
Classification report : 
               precision    recall  f1-score   support

           1       0.76      0.17      0.28       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.86      0.58      0.63      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 97.0 %
Precision : 75.9 %
Recall : 16.8 %
F1 Score : 0.275
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 58.3 %
MCC : 0.348
roc_auc_score: 0.583
KNN_K_value =  20
Model Name:  KNeighborsClassifier(n_neighbors=20)
Confusion matrix : 
 [[  36  226]
 [  12 7364]]
Outcome values : 
 36 226 12 7364
Classification report : 
               precision    recall  f1-score   support

           1       0.75      0.14      0.23       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.86      0.57      0.61      7638
weighted avg       0.96      0.97      0.96      7638

Accuracy : 96.9 %
Precision : 75.0 %
Recall : 13.7 %
F1 Score : 0.232
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 56.8 %
MCC : 0.313
roc_auc_score: 0.568
In [63]:
# display the KNNResults
KNNResults
Out[63]:
Model Name KNN K Value True_Positive False_Negative False_Positive True_Negative Accuracy Precision Recall F1 Score Specificity MCC ROC_AUC_Score Balanced Accuracy
0 KNeighborsClassifier(n_neighbors=1) 1 140 122 102 7274 0.971 0.579 0.534 0.556 0.986 0.541 0.760261 0.76
1 KNeighborsClassifier(n_neighbors=2) 2 63 199 43 7333 0.968 0.594 0.24 0.342 0.994 0.365 0.617314 0.617
2 KNeighborsClassifier(n_neighbors=3) 3 106 156 77 7299 0.969 0.579 0.405 0.476 0.99 0.469 0.69707 0.698
3 KNeighborsClassifier(n_neighbors=4) 4 62 200 35 7341 0.969 0.639 0.237 0.345 0.995 0.377 0.615948 0.616
4 KNeighborsClassifier() 5 83 179 62 7314 0.968 0.572 0.317 0.408 0.992 0.411 0.654194 0.654
5 KNeighborsClassifier(n_neighbors=6) 6 58 204 25 7351 0.97 0.699 0.221 0.336 0.997 0.383 0.608992 0.609
6 KNeighborsClassifier(n_neighbors=7) 7 77 185 41 7335 0.97 0.653 0.294 0.405 0.994 0.426 0.644167 0.644
7 KNeighborsClassifier(n_neighbors=8) 8 50 212 22 7354 0.969 0.694 0.191 0.299 0.997 0.354 0.593929 0.594
8 KNeighborsClassifier(n_neighbors=9) 9 58 204 31 7345 0.969 0.652 0.221 0.33 0.996 0.368 0.608586 0.608
9 KNeighborsClassifier(n_neighbors=10) 10 48 214 16 7360 0.97 0.75 0.183 0.294 0.998 0.361 0.590518 0.59
10 KNeighborsClassifier(n_neighbors=11) 11 55 207 24 7352 0.97 0.696 0.21 0.323 0.997 0.372 0.603335 0.604
11 KNeighborsClassifier(n_neighbors=12) 12 45 217 18 7358 0.969 0.714 0.172 0.277 0.998 0.341 0.584658 0.585
12 KNeighborsClassifier(n_neighbors=13) 13 50 212 21 7355 0.969 0.704 0.191 0.3 0.997 0.357 0.593996 0.594
13 KNeighborsClassifier(n_neighbors=14) 14 41 221 13 7363 0.969 0.759 0.156 0.259 0.998 0.336 0.577363 0.577
14 KNeighborsClassifier(n_neighbors=15) 15 48 214 18 7358 0.97 0.727 0.183 0.293 0.998 0.355 0.590383 0.59
15 KNeighborsClassifier(n_neighbors=16) 16 40 222 12 7364 0.969 0.769 0.153 0.255 0.998 0.334 0.575522 0.576
16 KNeighborsClassifier(n_neighbors=17) 17 45 217 13 7363 0.97 0.776 0.172 0.281 0.998 0.356 0.584997 0.585
17 KNeighborsClassifier(n_neighbors=18) 18 38 224 12 7364 0.969 0.76 0.145 0.244 0.998 0.324 0.571706 0.572
18 KNeighborsClassifier(n_neighbors=19) 19 44 218 14 7362 0.97 0.759 0.168 0.275 0.998 0.348 0.58302 0.583
19 KNeighborsClassifier(n_neighbors=20) 20 36 226 12 7364 0.969 0.75 0.137 0.232 0.998 0.313 0.567889 0.568

Naive Bayes Model(Guassianb)Algorithm¶

In [64]:
# Training the Naive Bayes model (GaussianNB) on the Training set

from sklearn.naive_bayes import GaussianNB

modelGNB = GaussianNB(priors=None, var_smoothing=1e-09)

# Fit the model with train data

modelGNB.fit(x_train,y_train)

# Predict the model with test data set

y_pred = modelGNB.predict(x_test)
y_pred_prob = modelGNB.predict_proba(x_test)

# Confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y_pred

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

C_Report = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n', C_Report)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3);
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%'  )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))

# ROC Curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
model_roc_auc = roc_auc_score(actual, predicted)
fpr, tpr, thresholds = roc_curve(actual,modelGNB.predict_proba(x_test)[:,1])
plt.figure()
# plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot(fpr, tpr, label= 'Classification Model' % model_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show() 
print('-----------------------------------------------------------------------------------------------------')
Confusion matrix : 
 [[ 262    0]
 [2131 5245]]
Outcome values : 
 262 0 2131 5245
Classification report : 
               precision    recall  f1-score   support

           1       0.11      1.00      0.20       262
           0       1.00      0.71      0.83      7376

    accuracy                           0.72      7638
   macro avg       0.55      0.86      0.51      7638
weighted avg       0.97      0.72      0.81      7638

Accuracy : 72.1 %
Precision : 10.9 %
Recall : 100.0 %
F1 Score : 0.197
Specificity or True Negative Rate : 71.1 %
Balanced Accuracy : 85.5 %
MCC : 0.279
roc_auc_score: 0.856
-----------------------------------------------------------------------------------------------------

Support Vector Machines_Linear Kernal (SVM)¶

In [65]:
# Load the results file for  SVM

EMResults1 = pd.read_excel(r"EMResultsNew.xlsx", header=0)

# Display the first 5 records

EMResults1.head()
Out[65]:
Model Name True_Positive False_Negative False_Positive True_Negative Accuracy Precision Recall F1 Score Specificity MCC ROC_AUC_Score Balanced Accuracy
In [66]:
# Training the SVM algorithm with train dataset

from sklearn.svm import SVC

ModelSVM1 = SVC(C=1.0, kernel='linear', degree=3, gamma='scale', coef0=0.0, shrinking=True, 
                probability=True, tol=0.001, cache_size=200, class_weight=None, verbose=False, 
                max_iter=- 1, decision_function_shape='ovr', break_ties=False, random_state=None)

# Train the model with train data 

ModelSVM1 = ModelSVM1.fit(x_train, y_train)

# Predict the model with test data set

y_pred = ModelSVM1.predict(x_test)
y_pred_prob = ModelSVM1.predict_proba(x_test)

# Print the model name
    
print('Model Name: ', "SVM - Linear")

# Confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y_pred

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

C_Report = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n', C_Report)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3);
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%'  )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))

# ROC Curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
model_roc_auc = roc_auc_score(actual, predicted)
fpr, tpr, thresholds = roc_curve(actual,ModelSVM1.predict_proba(x_test)[:,1])
plt.figure()
# plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot(fpr, tpr, label= 'Classification Model' % model_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show() 
print('-----------------------------------------------------------------------------------------------------')
#---
new_row = {'Model Name' : "SVM - Linear",
            'True_Positive' : tp, 
            'False_Negative' : fn, 
            'False_Positive' : fp,
            'True_Negative' : tn,
            'Accuracy' : accuracy,
            'Precision' : precision,
            'Recall' : sensitivity,
            'F1 Score' : f1Score,
            'Specificity' : specificity,
            'MCC':MCC,
            'ROC_AUC_Score':roc_auc_score(actual, predicted),
            'Balanced Accuracy':balanced_accuracy}
EMResults1 = EMResults1.append(new_row, ignore_index=True)
#-------------------------------------------------------------------------------------------------------------
Model Name:  SVM - Linear
Confusion matrix : 
 [[   0  262]
 [   0 7376]]
Outcome values : 
 0 262 0 7376
Classification report : 
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.48      0.50      0.49      7638
weighted avg       0.93      0.97      0.95      7638

Accuracy : 96.6 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Specificity or True Negative Rate : 100.0 %
Balanced Accuracy : 50.0 %
MCC : nan
roc_auc_score: 0.5
-----------------------------------------------------------------------------------------------------
In [67]:
# display the EMResults
EMResults1.head()
Out[67]:
Model Name True_Positive False_Negative False_Positive True_Negative Accuracy Precision Recall F1 Score Specificity MCC ROC_AUC_Score Balanced Accuracy
0 SVM - Linear 0 262 0 7376 0.966 NaN 0.0 0.0 1.0 NaN 0.5 0.5

SVM_Polinomial Kernal¶

In [68]:
# Training the SVM algorithm

from sklearn.svm import SVC

ModelSVMPoly = SVC(kernel='poly', degree=2, probability=True)

# Train the model

ModelSVMPoly.fit(x_train, y_train)

# Predict the model with test data set

y_pred = ModelSVMPoly.predict(x_test)
y_pred_prob = ModelSVMPoly.predict_proba(x_test)

# Print the model name
    
print('Model Name: ', "SVM - Polynominal")

# Confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y_pred

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

C_Report = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n', C_Report)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3);
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%'  )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(y_test, y_pred), 3))

# ROC Curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test,ModelSVMPoly.predict_proba(x_test)[:,1])
plt.figure()
# plt.plot
plt.plot(fpr, tpr, label= 'Classification Model' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show() 
print('-----------------------------------------------------------------------------------------------------')
#---
new_row = {'Model Name' : "SVM - Polynominal",
            'True_Positive' : tp, 
            'False_Negative' : fn, 
            'False_Positive' : fp,
            'True_Negative' : tn,
            'Accuracy' : accuracy,
            'Precision' : precision,
            'Recall' : sensitivity,
            'F1 Score' : f1Score,
            'Specificity' : specificity,
            'MCC':MCC,
            'ROC_AUC_Score':roc_auc_score(actual, predicted),
            'Balanced Accuracy':balanced_accuracy}
EMResults1 = EMResults1.append(new_row, ignore_index=True)
#-----------------------------------------------------------------------------------------------
Model Name:  SVM - Polynominal
Confusion matrix : 
 [[   0  262]
 [   0 7376]]
Outcome values : 
 0 262 0 7376
Classification report : 
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.48      0.50      0.49      7638
weighted avg       0.93      0.97      0.95      7638

Accuracy : 96.6 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Specificity or True Negative Rate : 100.0 %
Balanced Accuracy : 50.0 %
MCC : nan
roc_auc_score: 0.5
-----------------------------------------------------------------------------------------------------
In [69]:
# display the EMResults
EMResults1.head()
Out[69]:
Model Name True_Positive False_Negative False_Positive True_Negative Accuracy Precision Recall F1 Score Specificity MCC ROC_AUC_Score Balanced Accuracy
0 SVM - Linear 0 262 0 7376 0.966 NaN 0.0 0.0 1.0 NaN 0.5 0.5
1 SVM - Polynominal 0 262 0 7376 0.966 NaN 0.0 0.0 1.0 NaN 0.5 0.5

Gussain Kernal¶

In [70]:
# Training the SVM algorithm

from sklearn.svm import SVC

ModelSVMGaussian = SVC(kernel='rbf', random_state = 42, class_weight='balanced', probability=True)

# Train the model

ModelSVMGaussian.fit(x_train, y_train)

# Predict the model with test data set

y_pred = ModelSVMGaussian.predict(x_test)
y_pred_prob = ModelSVMGaussian.predict_proba(x_test)

# Confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Print the model name
    
print('Model Name: ', "SVM - Gaussian")

# actual values

actual = y_test

# predicted values

predicted = y_pred

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

C_Report = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n', C_Report)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3);
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%'  )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(y_test, y_pred), 3))

# ROC Curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test,ModelSVMGaussian.predict_proba(x_test)[:,1])
plt.figure()
# plt.plot
plt.plot(fpr, tpr, label= 'Classification Model' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show() 
print('-----------------------------------------------------------------------------------------------------')
#---
new_row = {'Model Name' : "SVM - Gaussian",
            'True_Positive' : tp, 
            'False_Negative' : fn, 
            'False_Positive' : fp,
            'True_Negative' : tn,
            'Accuracy' : accuracy,
            'Precision' : precision,
            'Recall' : sensitivity,
            'F1 Score' : f1Score,
            'Specificity' : specificity,
            'MCC':MCC,
            'ROC_AUC_Score':roc_auc_score(actual, predicted),
            'Balanced Accuracy':balanced_accuracy}
EMResults1 = EMResults1.append(new_row, ignore_index=True)
#---------------------------------------------------------------------------------------------------------------
Model Name:  SVM - Gaussian
Confusion matrix : 
 [[ 257    5]
 [1415 5961]]
Outcome values : 
 257 5 1415 5961
Classification report : 
               precision    recall  f1-score   support

           1       0.15      0.98      0.27       262
           0       1.00      0.81      0.89      7376

    accuracy                           0.81      7638
   macro avg       0.58      0.89      0.58      7638
weighted avg       0.97      0.81      0.87      7638

Accuracy : 81.4 %
Precision : 15.4 %
Recall : 98.1 %
F1 Score : 0.266
Specificity or True Negative Rate : 80.8 %
Balanced Accuracy : 89.5 %
MCC : 0.347
roc_auc_score: 0.895
-----------------------------------------------------------------------------------------------------
In [71]:
# display the EMResults
EMResults1.head()
Out[71]:
Model Name True_Positive False_Negative False_Positive True_Negative Accuracy Precision Recall F1 Score Specificity MCC ROC_AUC_Score Balanced Accuracy
0 SVM - Linear 0 262 0 7376 0.966 NaN 0.0 0.0 1.0 NaN 0.5 0.5
1 SVM - Polynominal 0 262 0 7376 0.966 NaN 0.0 0.0 1.0 NaN 0.5 0.5
2 SVM - Gaussian 257 5 1415 5961 0.814 0.154 0.981 0.266 0.808 0.347 0.894539 0.895

Sigmoid Kernal¶

In [72]:
# Training the SVM algorithm

from sklearn.svm import SVC

ModelSVMSig = SVC(kernel='sigmoid', random_state = 42, class_weight='balanced', probability=True)

# Train the model

ModelSVMSig.fit(x_train, y_train)

# Predict the model with test data set

y_pred = ModelSVMSig.predict(x_test)
y_pred_prob = ModelSVMSig.predict_proba(x_test)

# Print the model name
    
print('Model Name: ', "SVM - Sigmoid")

# Confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y_pred

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

C_Report = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n', C_Report)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3);
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%'  )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(y_test, y_pred), 3))

# ROC Curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test,ModelSVMSig.predict_proba(x_test)[:,1])
plt.figure()
# plt.plot
plt.plot(fpr, tpr, label= 'Classification Model' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show() 
print('-----------------------------------------------------------------------------------------------------')
#---
new_row = {'Model Name' : "SVM - Sigmoid",
            'True_Positive' : tp, 
            'False_Negative' : fn, 
            'False_Positive' : fp,
            'True_Negative' : tn,
            'Accuracy' : accuracy,
            'Precision' : precision,
            'Recall' : sensitivity,
            'F1 Score' : f1Score,
            'Specificity' : specificity,
            'MCC':MCC,
            'ROC_AUC_Score':roc_auc_score(actual, predicted),
            'Balanced Accuracy':balanced_accuracy}
EMResults1 = EMResults1.append(new_row, ignore_index=True)
#-----------------------------------------------------------------------------------------------------------
Model Name:  SVM - Sigmoid
Confusion matrix : 
 [[ 158  104]
 [3675 3701]]
Outcome values : 
 158 104 3675 3701
Classification report : 
               precision    recall  f1-score   support

           1       0.04      0.60      0.08       262
           0       0.97      0.50      0.66      7376

    accuracy                           0.51      7638
   macro avg       0.51      0.55      0.37      7638
weighted avg       0.94      0.51      0.64      7638

Accuracy : 50.5 %
Precision : 4.1 %
Recall : 60.3 %
F1 Score : 0.077
Specificity or True Negative Rate : 50.2 %
Balanced Accuracy : 55.2 %
MCC : 0.038
roc_auc_score: 0.552
-----------------------------------------------------------------------------------------------------
In [73]:
# display the EMRseults
EMResults1.head()
Out[73]:
Model Name True_Positive False_Negative False_Positive True_Negative Accuracy Precision Recall F1 Score Specificity MCC ROC_AUC_Score Balanced Accuracy
0 SVM - Linear 0 262 0 7376 0.966 NaN 0.0 0.0 1.0 NaN 0.5 0.5
1 SVM - Polynominal 0 262 0 7376 0.966 NaN 0.0 0.0 1.0 NaN 0.5 0.5
2 SVM - Gaussian 257 5 1415 5961 0.814 0.154 0.981 0.266 0.808 0.347 0.894539 0.895
3 SVM - Sigmoid 158 104 3675 3701 0.505 0.041 0.603 0.077 0.502 0.038 0.552408 0.552

Compare with the Classification algorithm¶

In [74]:
# load the concrete dataset

EMResults=pd.read_excel(r"EMResultsNew.xlsx",header=0)


# display the first 5 records
EMResults.head(10)
Out[74]:
Model Name True_Positive False_Negative False_Positive True_Negative Accuracy Precision Recall F1 Score Specificity MCC ROC_AUC_Score Balanced Accuracy
In [76]:
# Build the Calssification models and compare the results

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Create objects of classification algorithms with default hyper-parameters

ModelLR = LogisticRegression()
ModelDC = DecisionTreeClassifier()
ModelRF = RandomForestClassifier()
ModelET = ExtraTreesClassifier()
ModelKNN = KNeighborsClassifier(n_neighbors=1)
ModelGNB = GaussianNB()
ModelSVMGaussian = SVC(kernel='rbf', random_state = 42, class_weight='balanced', probability=True)



# Evalution matrix for all the algorithms

#MM = [ModelLR, ModelDC, ModelRF, ModelET, ModelKNN, ModelGNB, ModelSVM]
MM = [ModelLR, ModelDC, ModelRF, ModelET, ModelKNN, ModelGNB, ModelSVMGaussian]
for models in MM:
            
    # Train the model training dataset
    
    models.fit(x_train, y_train)
    
    # Prediction the model with test dataset
    
    y_pred = models.predict(x_test)
    y_pred_prob = models.predict_proba(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # confusion matrix in sklearn

    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report

    # actual values

    actual = y_test

    # predicted values

    predicted = y_pred

    # confusion matrix

    matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
    print('Confusion matrix : \n', matrix)

    # outcome values order in sklearn

    tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
    print('Outcome values : \n', tp, fn, fp, tn)

    # classification report for precision, recall f1-score and accuracy

    C_Report = classification_report(actual,predicted,labels=[1,0])

    print('Classification report : \n', C_Report)

    # calculating the metrics

    sensitivity = round(tp/(tp+fn), 3);
    specificity = round(tn/(tn+fp), 3);
    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
    balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
    precision = round(tp/(tp+fp), 3);
    f1Score = round((2*tp/(2*tp + fp + fn)), 3);

    # Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
    # A model with a score of +1 is a perfect model and -1 is a poor model

    from math import sqrt

    mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

    print('Accuracy :', round(accuracy*100, 2),'%')
    print('Precision :', round(precision*100, 2),'%')
    print('Recall :', round(sensitivity*100,2), '%')
    print('F1 Score :', f1Score)
    print('Specificity or True Negative Rate :', round(specificity*100,2), '%'  )
    print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
    print('MCC :', MCC)

    # Area under ROC curve 

    from sklearn.metrics import roc_curve, roc_auc_score

    print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))
    
    # ROC Curve
    
    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import roc_curve
    Model_roc_auc = roc_auc_score(actual, predicted)
    fpr, tpr, thresholds = roc_curve(actual, models.predict_proba(x_test)[:,1])
    plt.figure()
    #
    plt.plot(fpr, tpr, label= 'Classification Model' % Model_roc_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()
    print('-----------------------------------------------------------------------------------------------------')
    #----------------------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'True_Positive': tp,
               'False_Negative': fn, 
               'False_Positive': fp, 
               'True_Negative': tn,
               'Accuracy' : accuracy,
               'Precision' : precision,
               'Recall' : sensitivity,
               'F1 Score' : f1Score,
               'Specificity' : specificity,
               'MCC':MCC,
               'ROC_AUC_Score':roc_auc_score(actual, predicted),
               'Balanced Accuracy':balanced_accuracy}
    EMResults = EMResults.append(new_row, ignore_index=True)
    #------------------------------------------------------------------------------
Model Name:  LogisticRegression()
Confusion matrix : 
 [[   0  262]
 [   0 7376]]
Outcome values : 
 0 262 0 7376
Classification report : 
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       262
           0       0.97      1.00      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.48      0.50      0.49      7638
weighted avg       0.93      0.97      0.95      7638

Accuracy : 96.6 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Specificity or True Negative Rate : 100.0 %
Balanced Accuracy : 50.0 %
MCC : nan
roc_auc_score: 0.5
-----------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeClassifier()
Confusion matrix : 
 [[ 165   97]
 [ 191 7185]]
Outcome values : 
 165 97 191 7185
Classification report : 
               precision    recall  f1-score   support

           1       0.46      0.63      0.53       262
           0       0.99      0.97      0.98      7376

    accuracy                           0.96      7638
   macro avg       0.73      0.80      0.76      7638
weighted avg       0.97      0.96      0.97      7638

Accuracy : 96.2 %
Precision : 46.3 %
Recall : 63.0 %
F1 Score : 0.534
Specificity or True Negative Rate : 97.4 %
Balanced Accuracy : 80.2 %
MCC : 0.521
roc_auc_score: 0.802
-----------------------------------------------------------------------------------------------------
Model Name:  RandomForestClassifier()
Confusion matrix : 
 [[ 146  116]
 [  43 7333]]
Outcome values : 
 146 116 43 7333
Classification report : 
               precision    recall  f1-score   support

           1       0.77      0.56      0.65       262
           0       0.98      0.99      0.99      7376

    accuracy                           0.98      7638
   macro avg       0.88      0.78      0.82      7638
weighted avg       0.98      0.98      0.98      7638

Accuracy : 97.9 %
Precision : 77.2 %
Recall : 55.7 %
F1 Score : 0.647
Specificity or True Negative Rate : 99.4 %
Balanced Accuracy : 77.6 %
MCC : 0.646
roc_auc_score: 0.776
-----------------------------------------------------------------------------------------------------
Model Name:  ExtraTreesClassifier()
Confusion matrix : 
 [[ 128  134]
 [  52 7324]]
Outcome values : 
 128 134 52 7324
Classification report : 
               precision    recall  f1-score   support

           1       0.71      0.49      0.58       262
           0       0.98      0.99      0.99      7376

    accuracy                           0.98      7638
   macro avg       0.85      0.74      0.78      7638
weighted avg       0.97      0.98      0.97      7638

Accuracy : 97.6 %
Precision : 71.1 %
Recall : 48.9 %
F1 Score : 0.579
Specificity or True Negative Rate : 99.3 %
Balanced Accuracy : 74.1 %
MCC : 0.578
roc_auc_score: 0.741
-----------------------------------------------------------------------------------------------------
Model Name:  KNeighborsClassifier(n_neighbors=1)
Confusion matrix : 
 [[ 140  122]
 [ 102 7274]]
Outcome values : 
 140 122 102 7274
Classification report : 
               precision    recall  f1-score   support

           1       0.58      0.53      0.56       262
           0       0.98      0.99      0.98      7376

    accuracy                           0.97      7638
   macro avg       0.78      0.76      0.77      7638
weighted avg       0.97      0.97      0.97      7638

Accuracy : 97.1 %
Precision : 57.9 %
Recall : 53.4 %
F1 Score : 0.556
Specificity or True Negative Rate : 98.6 %
Balanced Accuracy : 76.0 %
MCC : 0.541
roc_auc_score: 0.76
-----------------------------------------------------------------------------------------------------
Model Name:  GaussianNB()
Confusion matrix : 
 [[ 262    0]
 [2131 5245]]
Outcome values : 
 262 0 2131 5245
Classification report : 
               precision    recall  f1-score   support

           1       0.11      1.00      0.20       262
           0       1.00      0.71      0.83      7376

    accuracy                           0.72      7638
   macro avg       0.55      0.86      0.51      7638
weighted avg       0.97      0.72      0.81      7638

Accuracy : 72.1 %
Precision : 10.9 %
Recall : 100.0 %
F1 Score : 0.197
Specificity or True Negative Rate : 71.1 %
Balanced Accuracy : 85.5 %
MCC : 0.279
roc_auc_score: 0.856
-----------------------------------------------------------------------------------------------------
Model Name:  SVC(class_weight='balanced', probability=True, random_state=42)
Confusion matrix : 
 [[ 257    5]
 [1415 5961]]
Outcome values : 
 257 5 1415 5961
Classification report : 
               precision    recall  f1-score   support

           1       0.15      0.98      0.27       262
           0       1.00      0.81      0.89      7376

    accuracy                           0.81      7638
   macro avg       0.58      0.89      0.58      7638
weighted avg       0.97      0.81      0.87      7638

Accuracy : 81.4 %
Precision : 15.4 %
Recall : 98.1 %
F1 Score : 0.266
Specificity or True Negative Rate : 80.8 %
Balanced Accuracy : 89.5 %
MCC : 0.347
roc_auc_score: 0.895
-----------------------------------------------------------------------------------------------------

RESULTS¶

In [77]:
# display the EMResults
EMResults.head(10)
Out[77]:
Model Name True_Positive False_Negative False_Positive True_Negative Accuracy Precision Recall F1 Score Specificity MCC ROC_AUC_Score Balanced Accuracy
0 LogisticRegression() 0 262 0 7376 0.966 NaN 0.0 0.0 1.0 NaN 0.5 0.5
1 DecisionTreeClassifier() 165 97 191 7185 0.962 0.463 0.63 0.534 0.974 0.521 0.801938 0.802
2 (DecisionTreeClassifier(max_features='auto', r... 146 116 43 7333 0.979 0.772 0.557 0.647 0.994 0.646 0.775711 0.776
3 (ExtraTreeClassifier(random_state=1137466445),... 128 134 52 7324 0.976 0.711 0.489 0.579 0.993 0.578 0.74075 0.741
4 KNeighborsClassifier(n_neighbors=1) 140 122 102 7274 0.971 0.579 0.534 0.556 0.986 0.541 0.760261 0.76
5 GaussianNB() 262 0 2131 5245 0.721 0.109 1.0 0.197 0.711 0.279 0.855545 0.855
6 SVC(class_weight='balanced', probability=True,... 257 5 1415 5961 0.814 0.154 0.981 0.266 0.808 0.347 0.894539 0.895

PREDICATION OF ALGORITHM¶

In [78]:
#predict the values with knn algorithm
y_predKNN = ModelRF.predict(x_test)
In [79]:
#create data frame with actual vs predict values
# display the final results

Results=pd.DataFrame({'is_preparatory_A':y_test,'is_preparatory_P':y_pred})

# Merge two dataframes on the index of both the dataframe

ResultsFinal=data_bk2.merge(Results,left_index=True,right_index=True)


# display 5 records randomly

ResultsFinal.sample(5)
Out[79]:
id year institute_type round_no quota pool institute_short program_name program_duration degree_short category opening_rank closing_rank is_preparatory is_preparatory_A is_preparatory_P
12463 12464 2021 IIT 2 AI Gender-Neutral IIT-Kharagpur Economics 4 Years BSc GEN-EWS 821 981 0 0 0
11193 11194 2021 IIT 1 AI Gender-Neutral IIT-Jodhpur Bio Engineering 4 Years B.Tech SC 2418 2724 0 0 1
14497 18679 2019 NIT 7 HS Gender-Neutral NIT-Agartala Mathematics and Computing 5 Years Int M.Tech SC 27407 42264 0 0 0
18430 23141 2020 NIT 6 OS Female-Only NIT-Agartala Chemical Engineering 4 Years B.Tech GEN-EWS 9670 9670 0 0 0
16739 20920 2019 NIT 7 HS Gender-Neutral NIT-Rourkela Chemical Engineering 5 Years Btech + M.Tech (IDD) GEN 33535 36735 0 0 0
In [ ]: